Code
import pandas as pd
eda = pd.read_parquet("data/eda.parquet")# identifying data analyst jobs by keyword searching
keywords = ['Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
'Data Science', 'Data Analysis','Data Analytics', 'Market Research Analyst'
'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst', 'Operations Analyst']
match = lambda col: eda[col].str.contains('|'.join(keywords), case=False, na=False)
eda['DATA_ANALYST_JOB'] = match('TITLE_NAME') \
| match('SKILLS_NAME') \
| match('SPECIALIZED_SKILLS_NAME')
eda['DATA_ANALYST_JOB'].value_counts()DATA_ANALYST_JOB
False 38212
True 33042
Name: count, dtype: int64
import plotly.graph_objects as go
from plotly.subplots import make_subplots
df_grouped = (
eda
.groupby(['DATA_ANALYST_JOB','NAICS2_NAME'])
.size()
.reset_index(name='Job_Count')
)
short_names = {
'Professional, Scientific, and Technical Services': 'Prof. Services',
'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
'Health Care and Social Assistance': 'Healthcare',
'Finance and Insurance': 'Finance',
'Information': 'Info Tech',
'Educational Services': 'Education',
'Manufacturing': 'Manufacturing',
'Retail Trade': 'Retail',
'Accommodation and Food Services': 'Hospitality',
'Other Services (except Public Administration)': 'Other Services'
}
df_grouped['Industry'] = df_grouped['NAICS2_NAME'].map(short_names).fillna(df_grouped['NAICS2_NAME'])
df_grouped['Job_Type'] = df_grouped['DATA_ANALYST_JOB'].map({True:'True', False:'False'})
pivot = (
df_grouped
.pivot_table(index='Industry', columns='Job_Type', values='Job_Count', fill_value=0)
.reset_index()
)
industries = pivot['Industry'].tolist()
y_true = pivot['True'].tolist()
y_false = pivot['False'].tolist()
# 2) Build a 2-row subplot: bar on top, table below
fig = make_subplots(
rows=2, cols=1,
row_heights=[0.70, 0.30], # give a bit more room to the table
specs=[[{"type":"bar"}],[{"type":"table"}]],
vertical_spacing=0.12 # more space between bar and table
)
colors = {'True': '#FFE5E5', 'False': '#FF6B6B'}
fig.add_trace(
go.Bar(
x=industries, y=y_true, name='True',
marker=dict(color=colors['True'], line=dict(color='#A81D1D', width=1)),
text=y_true, textposition='outside'
),
row=1, col=1
)
fig.add_trace(
go.Bar(
x=industries, y=y_false, name='False',
marker=dict(color=colors['False'], line=dict(color='#A81D1D', width=1)),
text=y_false, textposition='outside'
),
row=1, col=1
)
# 3) Slider steps: 0 → 8 000 in 200s
steps = []
for val in range(0, 8001, 200):
steps.append(dict(
label=str(val),
method="update",
args=[
{"y": [
[v if v>=val else 0 for v in y_true],
[v if v>=val else 0 for v in y_false]
]},
{"title": f"Min Jobs ≥ {val:,}"}
]
))
# 4) Final layout tweaks
fig.update_layout(
# lift slider above everything
sliders=[dict(
active=0,
currentvalue={"prefix":"Min Jobs: "},
pad={"b":0},
x=0.05,
y=1.05, # move slider way above the plot area
xanchor="left",
yanchor="bottom",
len=0.7,
font=dict(color='#A81D1D'),
steps=steps
)],
title=dict(
text="Data & Business Analytics Job Trends",
font=dict(size=24, color='#A81D1D'),
x=0.5,
y=0.95, # drop the title just below the slider
xanchor="center",
yanchor="top"
),
width=1100, height=850,
margin=dict(l=60, r=60, t=180, b=200), # extra top & bottom margin
plot_bgcolor='white',
paper_bgcolor='white',
xaxis=dict(
title="Industry",
title_font=dict(size=16, color='#A81D1D'),
tickmode='array',
tickvals=list(range(len(industries))),
ticktext=industries,
tickangle=-30,
tickfont=dict(size=11, color='#333'),
showline=True, linecolor='#A81D1D'
),
yaxis=dict(
title="Number of Jobs",
title_font=dict(size=16, color='#A81D1D'),
tickfont=dict(size=11, color='#333'),
gridcolor='rgba(200,200,200,0.3)',
showline=True, linecolor='#A81D1D',
range=[0, max(max(y_true),max(y_false))*1.2]
),
legend=dict(
title="Data Analyst Job",
title_font=dict(color='#A81D1D'),
font=dict(size=12),
x=0.95, y=0.95
),
bargap=0.2
)
fig.show()
fig.write_html(
"figures/edaplot1.html",
include_plotlyjs="cdn", # Use CDN to load Plotly JS
full_html=False # Only include the plot div
)import plotly.express as px
import pandas as pd
# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()
# Step 1: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})
# Step 2: Clean the data (remove rows with missing SPECIALIZED_SKILLS_NAME)
df = df.dropna(subset=['SPECIALIZED_SKILLS_NAME'])
# Debug: Check the number of rows after cleaning
print("Number of rows after cleaning:", len(df))
# Step 3: Split the SPECIALIZED_SKILLS_NAME into individual skills
# Assuming SPECIALIZED_SKILLS_NAME is a string of skills separated by commas or another delimiter
df_skills = df.copy()
df_skills['SPECIALIZED_SKILLS_NAME'] = df_skills['SPECIALIZED_SKILLS_NAME'].str.split(',') # Adjust delimiter if needed
df_skills = df_skills.explode('SPECIALIZED_SKILLS_NAME')
df_skills['SPECIALIZED_SKILLS_NAME'] = df_skills['SPECIALIZED_SKILLS_NAME'].str.strip()
# Step 4: Group by skill and Job_Category to get the count
df_skills_count = df_skills.groupby(['SPECIALIZED_SKILLS_NAME', 'Job_Category']).size().reset_index(name='Count')
# Step 5: Get the top 10 skills by total count
top_skills = df_skills_count.groupby('SPECIALIZED_SKILLS_NAME')['Count'].sum().nlargest(10).index
df_skills_top = df_skills_count[df_skills_count['SPECIALIZED_SKILLS_NAME'].isin(top_skills)]
# Debug: Check the grouped data
print("Top 10 specialized skills:")
print(df_skills_top)
# Step 6: Create the bar plot
fig = px.bar(
df_skills_top,
x='Count',
y='SPECIALIZED_SKILLS_NAME',
color='Job_Category',
barmode='stack',
color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
title='Top 10 Specialized Skills by Job Category'
)
# Step 7: Update layout for styling
fig.update_layout(
width=900,
height=600,
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
title=dict(
font=dict(size=24, color='#FF6B6B'),
x=0.5,
xanchor='center',
y=0.95,
yanchor='top'
),
xaxis=dict(
title='Number of Jobs',
title_font=dict(size=16),
tickfont=dict(size=12),
gridcolor='#E2E8F0',
linecolor='#2D3748',
linewidth=2,
showline=True,
showgrid=True,
zeroline=False
),
yaxis=dict(
title='Specialized Skill',
title_font=dict(size=16),
tickfont=dict(size=12)
),
legend=dict(
title='Job Category',
font=dict(size=13),
bgcolor='#FFFFFF',
bordercolor='#FF6B6B',
borderwidth=1,
x=1.02,
y=0.5,
xanchor='left',
yanchor='middle'
)
)
fig.show()
# Save to HTML
fig.write_html(
'figures/edaplot2.html',
include_plotlyjs='cdn',
full_html=False
)Number of rows after cleaning: 71254
Top 10 specialized skills:
SPECIALIZED_SKILLS_NAME Job_Category Count
2071 "Business Intelligence" Analytics Job 8077
2072 "Business Intelligence" Non-Analytics Job 1778
2167 "Business Process" Analytics Job 4417
2168 "Business Process" Non-Analytics Job 8385
2180 "Business Requirements" Analytics Job 4972
2181 "Business Requirements" Non-Analytics Job 7740
4151 "Dashboard" Analytics Job 9975
4152 "Dashboard" Non-Analytics Job 1559
4173 "Data Analysis" Analytics Job 25620
6640 "Finance" Analytics Job 5573
6641 "Finance" Non-Analytics Job 6220
13332 "Project Management" Analytics Job 6236
13333 "Project Management" Non-Analytics Job 6939
13594 "Python (Programming Language)" Analytics Job 10127
13595 "Python (Programming Language)" Non-Analytics Job 1703
14459 "SAP Applications" Analytics Job 2174
14460 "SAP Applications" Non-Analytics Job 9149
14686 "SQL (Programming Language)" Analytics Job 16067
14687 "SQL (Programming Language)" Non-Analytics Job 4202
import plotly.express as px
import pandas as pd
# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()
# Step 1: Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2
# Step 2: Clean the data (remove rows with missing values for the variables we need)
df = df.dropna(subset=['SALARY', 'Avg_Years_Experience', 'REMOTE_TYPE_NAME', 'MIN_EDULEVELS_NAME'])
# Debug: Check the number of rows after cleaning
print("Number of rows after cleaning:", len(df))
# Step 3: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})
# Step 4: Encode categorical variables numerically for the parallel coordinates plot
# Encode REMOTE_TYPE_NAME (e.g., Remote=2, Hybrid=1, On-site=0)
remote_mapping = {'Remote': 2, 'Hybrid': 1, 'On-site': 0} # Adjust based on your unique values
df['Remote_Type_Code'] = df['REMOTE_TYPE_NAME'].map(remote_mapping)
# Encode MIN_EDULEVELS_NAME (e.g., High School=0, Bachelor's=1, Master's=2, PhD=3)
edu_mapping = {'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2, 'PhD': 3} # Adjust based on your unique values
df['Edu_Level_Code'] = df['MIN_EDULEVELS_NAME'].map(edu_mapping)
# Step 5: Clean again after encoding (in case some mappings failed)
df = df.dropna(subset=['Remote_Type_Code', 'Edu_Level_Code'])
# Debug: Check the encoded data
print("Sample encoded data:")
print(df[['SALARY', 'Avg_Years_Experience', 'Remote_Type_Code', 'Edu_Level_Code', 'Job_Category']].head())
# Step 6: Create the parallel coordinates plot
fig = px.parallel_coordinates(
df,
dimensions=['SALARY', 'Avg_Years_Experience', 'Remote_Type_Code', 'Edu_Level_Code'],
color_continuous_scale=['#4ECDC4', '#FF6B6B'], # Gradient from teal to red
color=df['Job_Category'].map({'Analytics Job': 1, 'Non-Analytics Job': 0}), # Color by Job_Category
labels={
'SALARY': 'Salary ($)',
'Avg_Years_Experience': 'Avg. Years Experience',
'Remote_Type_Code': 'Remote Type (0=On-site, 1=Hybrid, 2=Remote)',
'Edu_Level_Code': 'Min. Education Level (0=HS, 1=Bachelor\'s, 2=Master\'s, 3=PhD)'
},
title='Multidimensional Comparison of Job Attributes by Job Category'
)
# Step 7: Update layout for styling
fig.update_layout(
width=1000,
height=600,
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
title=dict(
font=dict(size=24, color='#FF6B6B'),
x=0.5,
xanchor='center',
y=0.95,
yanchor='top'
)
)
fig.show()
fig.write_html(
"figures/edaplot3.html",
include_plotlyjs="cdn", # Use CDN to load Plotly JS
full_html=False # Only include the plot div
)Number of rows after cleaning: 71254
Sample encoded data:
Empty DataFrame
Columns: [SALARY, Avg_Years_Experience, Remote_Type_Code, Edu_Level_Code, Job_Category]
Index: []
import plotly.express as px
import pandas as pd
# Prepare the data
df = eda.copy()
# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
if row['DATA_ANALYST_JOB']:
return True
title = str(row['TITLE_NAME']).lower() if 'TITLE_NAME' in row else str(row['TITLE']).lower()
if 'business analyst' in title:
return True
return False
df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})
# Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2
# Clean the data (remove rows with missing salary or experience)
df = df.dropna(subset=['Avg_Years_Experience', 'SALARY'])
# Create the scatter plot with trend line
fig = px.scatter(df,
x='Avg_Years_Experience',
y='SALARY',
color='Job_Category',
trendline='ols', # Add trend line (ordinary least squares)
title='Experience Requirements vs Salary for Analytics Jobs',
labels={'Avg_Years_Experience': 'Average Years of Experience', 'SALARY': 'Salary ($)', 'Job_Category': 'Job Category'},
color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'})
# Beautify the layout with a red-white theme (no gradients)
fig.update_layout(
width=900,
height=600,
plot_bgcolor='#FFFFFF', # Plain white background
paper_bgcolor='#FFFFFF', # Plain white background
font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
title=dict(
font=dict(size=24, color="#FF6B6B"), # Red title for theme
x=0.5,
xanchor="center",
y=0.95,
yanchor="top"
),
xaxis=dict(
title="Average Years of Experience",
title_font=dict(size=16),
tickfont=dict(size=12),
gridcolor="#E2E8F0",
linecolor="#2D3748",
linewidth=2,
showline=True,
showgrid=True,
zeroline=False
),
yaxis=dict(
title="Salary ($)",
title_font=dict(size=16),
tickfont=dict(size=12),
gridcolor="#E2E8F0",
linecolor="#2D3748",
linewidth=2,
showline=True,
showgrid=True,
zeroline=False
),
legend=dict(
title="Job Category",
font=dict(size=13),
bgcolor="#FFFFFF",
bordercolor="#FF6B6B", # Red border for theme
borderwidth=1,
x=1.02,
y=0.5,
xanchor="left",
yanchor="middle"
),
hovermode="closest",
hoverlabel=dict(
bgcolor="#FFFFFF",
font_size=12,
font_family="Inter, sans-serif",
font_color="#2D3748",
bordercolor="#FF6B6B" # Red border for hover
)
)
# Customize scatter points
fig.update_traces(
marker=dict(
size=8,
opacity=0.7,
line=dict(width=1, color="#2D3748")
)
)
fig.show()
fig.write_html(
"figures/edaplot4.html",
include_plotlyjs="cdn", # Use CDN to load Plotly JS
full_html=False # Only include the plot div
)